#Loading Libraries
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(boot)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmboot)
library(lattice)
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:boot':
##
## melanoma
library(caret)
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(naniar)
library(utils)
library(stats)
## Reading in the data set
# The CSV is looked up relative to the project folder set here.
setwd("/Users/xaviermojica/Desktop/Stats2/Project1") #/Users/xaviermojica/
life <- read.csv(file = "Life Expectancy Data (1).csv")
# First look at the raw relationship between GDP and life expectancy.
ggplot(life, aes(x = GDP, y = Life.expectancy)) +
  geom_point()
## Warning: Removed 453 rows containing missing values (`geom_point()`).
## Looking at the scatterplot of the original data set, GDP (the x variable)
## appears to need a log transformation, since we are interested in the
## relationship between Life Expectancy and GDP.
## Checking data types
str(life)
## 'data.frame': 2938 obs. of 22 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : int 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
## $ Status : chr "Developing" "Developing" "Developing" "Developing" ...
## $ Life.expectancy : num 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult.Mortality : int 263 271 268 272 275 279 281 287 295 295 ...
## $ infant.deaths : int 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage.expenditure : num 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis.B : int 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : int 1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
## $ BMI : num 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under.five.deaths : int 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : int 6 58 62 67 68 66 63 64 63 58 ...
## $ Total.expenditure : num 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : int 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num 584.3 612.7 631.7 670 63.5 ...
## $ Population : num 33736494 327582 31731688 3696958 2978599 ...
## $ thinness..1.19.years : num 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness.5.9.years : num 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income.composition.of.resources: num 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Plot the missing-value pattern of every column.
vis_miss(life)
dim(life)
## [1] 2938 22
# Open the spreadsheet viewer only when someone is at the console: View()
# needs a data viewer and serves no purpose when the script is knitted or run
# in batch. This mirrors view(), used later in the file, which does nothing in
# non-interactive sessions.
if (interactive()) {
  View(life)
}
# Imputing using the median
# GDP is missing in roughly 15% of rows but is kept and imputed: it is assumed
# to be crucial for predicting Life.expectancy, as richer countries have better
# access to health care, medicine and technology. The values appear to be GDP
# per capita, which already accounts for population size, so GDP per capita and
# Population would likely be closely related.
#
# Columns left out of the imputation model: the identifiers (Country, Year,
# Status), the response (Life.expectancy) and Hepatitis.B, which is dropped
# further down. They are selected by name instead of by position (1:4 and 9)
# so that a change in column order cannot silently select the wrong columns.
not_imputed <- c("Country", "Year", "Status", "Life.expectancy", "Hepatitis.B")
imputeMedian <- preProcess(
  life[, setdiff(names(life), not_imputed)],
  method = "medianImpute"
)
cleandataMedian <- predict(imputeMedian, newdata = life)
dim(cleandataMedian)
## [1] 2938 22
# The x-axis labels are rotated so the column names in vis_miss() stay readable.
vis_miss(cleandataMedian) + theme(axis.text.x = element_text(angle = 90, hjust = 0))
# Literature says that more than 10% missing data can contribute to bias.
# Hepatitis.B (19% missing) and Population (22% missing) are therefore removed.
too_sparse <- c("Hepatitis.B", "Population")
cleandataMedian <- cleandataMedian[, setdiff(names(cleandataMedian), too_sparse)]
vis_miss(cleandataMedian) + theme(axis.text.x = element_text(angle = 90, hjust = 0))
# Drop the rows that still contain an NA after imputation.
cleandataMedian <- na.omit(cleandataMedian)
vis_miss(cleandataMedian) + theme(axis.text.x = element_text(angle = 90, hjust = 0))
ggplot(data = cleandataMedian) + geom_point(mapping = aes(x = GDP, y = Life.expectancy))
# Add natural-log versions of GDP and of the response as new columns.
cleandataMedian[["logGDP"]] <- log(cleandataMedian[["GDP"]])
cleandataMedian[["logLife.expectancy"]] <- log(cleandataMedian[["Life.expectancy"]])
# Scatterplot of the two log-transformed variables.
ggplot(cleandataMedian, aes(x = logGDP, y = logLife.expectancy)) +
  geom_point()
# Alternative cleaning: impute, then remove incomplete rows
# Every column except the first four is median-imputed here; no column is
# dropped. Rows that still hold an NA afterwards are removed.
imputeMedian <- preProcess(life[, -(1:4)], method = "medianImpute")
cleandataMedian1 <- predict(imputeMedian, newdata = life)
dim(cleandataMedian1)
## [1] 2938 22
vis_miss(cleandataMedian1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))
# Remove the remaining NA rows (noted by the author as < 0.1% missing).
cleandataMedian1 <- na.omit(cleandataMedian1)
vis_miss(cleandataMedian1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))
dim(cleandataMedian1)
## [1] 2928 22
#Multivariable Plots
library(ISLR)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(GGally)
library(ggplot2)
# Scatterplot matrix of columns 5 to 22 of the cleaned data.
ggpairs(
  cleandataMedian[, 5:22],
  lower = list(
    continuous = wrap("points", color = "red", alpha = 0.5),
    combo = wrap("box", color = "orange", alpha = 0.3),
    discrete = wrap("facetbar", color = "yellow", alpha = 0.3)
  ),
  diag = list(continuous = wrap("densityDiag", color = "blue", alpha = 0.5))
)
#ggpairs(cleandataMedian[,5:22], upper = list(continuous = wrap("cor", size = 4.75, align_percent = 1)))
#ggscatmat(cleandataMedian, columns = 5:22)
# GDP against life expectancy on the second cleaned data set, raw scale.
ggplot(cleandataMedian1, aes(x = GDP, y = Life.expectancy)) +
  geom_point()
# Same plot with GDP on the log scale.
ggplot(cleandataMedian1, aes(x = log(GDP), y = Life.expectancy)) +
  geom_point()
# Chosen multiple linear regression model and its residuals
# NOTE: despite the object's name, the formula below contains nine predictors.
eightVar <- lm(
  Life.expectancy ~ HIV.AIDS + Schooling + Alcohol + BMI + Polio + Diphtheria +
    logGDP + thinness..1.19.years + Income.composition.of.resources,
  data = cleandataMedian
)
summary(eightVar)
##
## Call:
## lm(formula = Life.expectancy ~ HIV.AIDS + Schooling + Alcohol +
## BMI + Polio + Diphtheria + logGDP + thinness..1.19.years +
## Income.composition.of.resources, data = cleandataMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.4403 -2.5682 0.0843 2.7047 18.8786
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.994510 0.557803 77.078 < 2e-16 ***
## HIV.AIDS -0.678646 0.017783 -38.164 < 2e-16 ***
## Schooling 0.804939 0.048463 16.609 < 2e-16 ***
## Alcohol 0.055757 0.026443 2.109 0.0351 *
## BMI 0.054950 0.005583 9.842 < 2e-16 ***
## Polio 0.035731 0.005047 7.079 1.81e-12 ***
## Diphtheria 0.045743 0.005000 9.148 < 2e-16 ***
## logGDP 0.616229 0.063235 9.745 < 2e-16 ***
## thinness..1.19.years -0.120226 0.024449 -4.917 9.26e-07 ***
## Income.composition.of.resources 7.369419 0.717204 10.275 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.634 on 2918 degrees of freedom
## Multiple R-squared: 0.7639, Adjusted R-squared: 0.7632
## F-statistic: 1049 on 9 and 2918 DF, p-value: < 2.2e-16
# 95% confidence intervals for the coefficients.
confint(eightVar)
## 2.5 % 97.5 %
## (Intercept) 41.900781759 44.08823816
## HIV.AIDS -0.713513984 -0.64377855
## Schooling 0.709914467 0.89996366
## Alcohol 0.003908609 0.10760635
## BMI 0.044002477 0.06589743
## Polio 0.025833590 0.04562761
## Diphtheria 0.035938319 0.05554730
## logGDP 0.492238370 0.74021949
## thinness..1.19.years -0.168165244 -0.07228603
## Income.composition.of.resources 5.963142186 8.77569544
# Diagnostic plots of the fitted model, used to inspect the residuals.
plot(eightVar)
#Forward, Backward, Stepwise Selection
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
#Full Model
set.seed(1246)
fitFull <- lm(
  Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
    percentage.expenditure + Measles + BMI + under.five.deaths + Polio +
    Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
    thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP,
  data = cleandataMedian
)
# Forward selection must start from the smallest model and be told how far it
# may grow. When stepAIC() is given the full model and no `scope`, the initial
# model is also the upper limit, so nothing can be added and the "selected"
# model is simply fitFull. The search now starts from the intercept-only model
# and may add any term of fitFull.
# NOTE: the recorded trace right after this call, and the recorded
# summary(stepup) output further down, predate this fix and show the full
# model; re-run the script to regenerate them.
fitNull <- lm(Life.expectancy ~ 1, data = cleandataMedian)
stepup <- stepAIC(
  fitNull,
  scope = list(lower = ~1, upper = formula(fitFull)),
  direction = "forward",
  steps = 2000
)
## Start: AIC=8219.89
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP
# Backward elimination by AIC, starting from the full model.
stepdown <- stepAIC(object = fitFull, direction = "backward", steps = 2000)
## Start: AIC=8219.89
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP
##
## Df Sum of Sq RSS AIC
## - thinness.5.9.years 1 2.3 47947 8218.0
## <none> 47945 8219.9
## - thinness..1.19.years 1 72.4 48017 8222.3
## - Total.expenditure 1 96.8 48041 8223.8
## - Measles 1 139.7 48084 8226.4
## - Alcohol 1 286.9 48231 8235.4
## - percentage.expenditure 1 329.8 48274 8238.0
## - Polio 1 617.7 48562 8255.4
## - logGDP 1 723.7 48668 8261.8
## - Diphtheria 1 1005.8 48950 8278.7
## - BMI 1 1054.2 48999 8281.6
## - Income.composition.of.resources 1 1155.8 49100 8287.6
## - infant.deaths 1 2421.5 50366 8362.2
## - under.five.deaths 1 2445.4 50390 8363.5
## - Schooling 1 3913.0 51858 8447.6
## - Adult.Mortality 1 10813.5 58758 8813.4
## - HIV.AIDS 1 11560.1 59505 8850.4
##
## Step: AIC=8218.03
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP
##
## Df Sum of Sq RSS AIC
## <none> 47947 8218.0
## - Total.expenditure 1 95.5 48042 8221.9
## - Measles 1 141.0 48088 8224.6
## - thinness..1.19.years 1 229.8 48177 8230.0
## - Alcohol 1 285.9 48233 8233.4
## - percentage.expenditure 1 330.1 48277 8236.1
## - Polio 1 616.7 48564 8253.4
## - logGDP 1 721.4 48668 8259.8
## - Diphtheria 1 1008.9 48956 8277.0
## - BMI 1 1056.2 49003 8279.8
## - Income.composition.of.resources 1 1156.5 49103 8285.8
## - infant.deaths 1 2438.7 50386 8361.3
## - under.five.deaths 1 2457.7 50405 8362.4
## - Schooling 1 3919.7 51867 8446.1
## - Adult.Mortality 1 10813.7 58761 8811.5
## - HIV.AIDS 1 11559.5 59506 8848.4
# Stepwise search by AIC in both directions, starting from the full model.
stepboth <- stepAIC(object = fitFull, direction = "both", steps = 2000)
## Start: AIC=8219.89
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP
##
## Df Sum of Sq RSS AIC
## - thinness.5.9.years 1 2.3 47947 8218.0
## <none> 47945 8219.9
## - thinness..1.19.years 1 72.4 48017 8222.3
## - Total.expenditure 1 96.8 48041 8223.8
## - Measles 1 139.7 48084 8226.4
## - Alcohol 1 286.9 48231 8235.4
## - percentage.expenditure 1 329.8 48274 8238.0
## - Polio 1 617.7 48562 8255.4
## - logGDP 1 723.7 48668 8261.8
## - Diphtheria 1 1005.8 48950 8278.7
## - BMI 1 1054.2 48999 8281.6
## - Income.composition.of.resources 1 1155.8 49100 8287.6
## - infant.deaths 1 2421.5 50366 8362.2
## - under.five.deaths 1 2445.4 50390 8363.5
## - Schooling 1 3913.0 51858 8447.6
## - Adult.Mortality 1 10813.5 58758 8813.4
## - HIV.AIDS 1 11560.1 59505 8850.4
##
## Step: AIC=8218.03
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP
##
## Df Sum of Sq RSS AIC
## <none> 47947 8218.0
## + thinness.5.9.years 1 2.3 47945 8219.9
## - Total.expenditure 1 95.5 48042 8221.9
## - Measles 1 141.0 48088 8224.6
## - thinness..1.19.years 1 229.8 48177 8230.0
## - Alcohol 1 285.9 48233 8233.4
## - percentage.expenditure 1 330.1 48277 8236.1
## - Polio 1 616.7 48564 8253.4
## - logGDP 1 721.4 48668 8259.8
## - Diphtheria 1 1008.9 48956 8277.0
## - BMI 1 1056.2 49003 8279.8
## - Income.composition.of.resources 1 1156.5 49103 8285.8
## - infant.deaths 1 2438.7 50386 8361.3
## - under.five.deaths 1 2457.7 50405 8362.4
## - Schooling 1 3919.7 51867 8446.1
## - Adult.Mortality 1 10813.7 58761 8811.5
## - HIV.AIDS 1 11559.5 59506 8848.4
# Coefficient summary of each selected model, starting with forward selection.
up <- summary(stepup)
print(up)
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP, data = cleandataMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.2860 -2.1523 0.0363 2.3535 15.6953
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.182e+01 6.254e-01 82.858 < 2e-16 ***
## Adult.Mortality -2.026e-02 7.907e-04 -25.623 < 2e-16 ***
## infant.deaths 1.004e-01 8.282e-03 12.125 < 2e-16 ***
## Alcohol 1.007e-01 2.412e-02 4.173 3.09e-05 ***
## percentage.expenditure 2.117e-04 4.730e-05 4.475 7.94e-06 ***
## Measles -2.229e-05 7.656e-06 -2.912 0.00362 **
## BMI 3.993e-02 4.990e-03 8.000 1.77e-15 ***
## under.five.deaths -7.461e-02 6.123e-03 -12.185 < 2e-16 ***
## Polio 2.718e-02 4.439e-03 6.124 1.03e-09 ***
## Total.expenditure 8.267e-02 3.409e-02 2.425 0.01538 *
## Diphtheria 3.461e-02 4.428e-03 7.815 7.64e-15 ***
## HIV.AIDS -4.658e-01 1.758e-02 -26.493 < 2e-16 ***
## thinness..1.19.years -1.059e-01 5.051e-02 -2.096 0.03616 *
## thinness.5.9.years 1.869e-02 4.979e-02 0.375 0.70745
## Income.composition.of.resources 5.333e+00 6.367e-01 8.377 < 2e-16 ***
## Schooling 6.609e-01 4.288e-02 15.414 < 2e-16 ***
## logGDP 4.139e-01 6.244e-02 6.629 4.02e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.058 on 2911 degrees of freedom
## Multiple R-squared: 0.8194, Adjusted R-squared: 0.8184
## F-statistic: 825.5 on 16 and 2911 DF, p-value: < 2.2e-16
# Summary of the model kept by backward elimination.
down <- summary(stepdown)
print(down)
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP, data = cleandataMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.2742 -2.1492 0.0321 2.3594 15.6889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.185e+01 6.220e-01 83.361 < 2e-16 ***
## Adult.Mortality -2.025e-02 7.902e-04 -25.627 < 2e-16 ***
## infant.deaths 1.006e-01 8.266e-03 12.170 < 2e-16 ***
## Alcohol 1.005e-01 2.411e-02 4.167 3.17e-05 ***
## percentage.expenditure 2.118e-04 4.730e-05 4.477 7.85e-06 ***
## Measles -2.239e-05 7.651e-06 -2.926 0.00346 **
## BMI 3.972e-02 4.959e-03 8.009 1.66e-15 ***
## under.five.deaths -7.472e-02 6.116e-03 -12.217 < 2e-16 ***
## Polio 2.716e-02 4.437e-03 6.120 1.06e-09 ***
## Total.expenditure 8.195e-02 3.403e-02 2.408 0.01611 *
## Diphtheria 3.465e-02 4.426e-03 7.828 6.90e-15 ***
## HIV.AIDS -4.657e-01 1.757e-02 -26.496 < 2e-16 ***
## thinness..1.19.years -8.917e-02 2.387e-02 -3.736 0.00019 ***
## Income.composition.of.resources 5.335e+00 6.366e-01 8.381 < 2e-16 ***
## Schooling 6.613e-01 4.286e-02 15.429 < 2e-16 ***
## logGDP 4.125e-01 6.231e-02 6.619 4.29e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.058 on 2912 degrees of freedom
## Multiple R-squared: 0.8194, Adjusted R-squared: 0.8185
## F-statistic: 880.8 on 15 and 2912 DF, p-value: < 2.2e-16
# Summary of the model kept by the both-directions search.
both <- summary(stepboth)
print(both)
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP, data = cleandataMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.2742 -2.1492 0.0321 2.3594 15.6889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.185e+01 6.220e-01 83.361 < 2e-16 ***
## Adult.Mortality -2.025e-02 7.902e-04 -25.627 < 2e-16 ***
## infant.deaths 1.006e-01 8.266e-03 12.170 < 2e-16 ***
## Alcohol 1.005e-01 2.411e-02 4.167 3.17e-05 ***
## percentage.expenditure 2.118e-04 4.730e-05 4.477 7.85e-06 ***
## Measles -2.239e-05 7.651e-06 -2.926 0.00346 **
## BMI 3.972e-02 4.959e-03 8.009 1.66e-15 ***
## under.five.deaths -7.472e-02 6.116e-03 -12.217 < 2e-16 ***
## Polio 2.716e-02 4.437e-03 6.120 1.06e-09 ***
## Total.expenditure 8.195e-02 3.403e-02 2.408 0.01611 *
## Diphtheria 3.465e-02 4.426e-03 7.828 6.90e-15 ***
## HIV.AIDS -4.657e-01 1.757e-02 -26.496 < 2e-16 ***
## thinness..1.19.years -8.917e-02 2.387e-02 -3.736 0.00019 ***
## Income.composition.of.resources 5.335e+00 6.366e-01 8.381 < 2e-16 ***
## Schooling 6.613e-01 4.286e-02 15.429 < 2e-16 ***
## logGDP 4.125e-01 6.231e-02 6.619 4.29e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.058 on 2912 degrees of freedom
## Multiple R-squared: 0.8194, Adjusted R-squared: 0.8185
## F-statistic: 880.8 on 15 and 2912 DF, p-value: < 2.2e-16
# Show the call and coefficients of the full model.
print(fitFull)
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP, data = cleandataMedian)
##
## Coefficients:
## (Intercept) Adult.Mortality
## 5.182e+01 -2.026e-02
## infant.deaths Alcohol
## 1.004e-01 1.007e-01
## percentage.expenditure Measles
## 2.117e-04 -2.229e-05
## BMI under.five.deaths
## 3.993e-02 -7.461e-02
## Polio Total.expenditure
## 2.718e-02 8.267e-02
## Diphtheria HIV.AIDS
## 3.461e-02 -4.658e-01
## thinness..1.19.years thinness.5.9.years
## -1.059e-01 1.869e-02
## Income.composition.of.resources Schooling
## 5.333e+00 6.609e-01
## logGDP
## 4.139e-01
# Forward selection by AIC using olsrr, with the terms of fitFull as candidates.
# The recorded summary below lists the variables in the order they were added.
olsrr::ols_step_forward_aic(fitFull)
##
## Selection Summary
## ------------------------------------------------------------------------------------------------
## Variable AIC Sum Sq RSS R-Sq Adj. R-Sq
## ------------------------------------------------------------------------------------------------
## Schooling 19396.624 136605.379 128885.381 0.51454 0.51437
## Adult.Mortality 18025.496 184853.664 80637.096 0.69627 0.69606
## HIV.AIDS 17468.139 198876.292 66614.468 0.74909 0.74883
## Diphtheria 17162.353 205523.215 59967.545 0.77413 0.77382
## BMI 16998.736 208821.027 56669.733 0.78655 0.78618
## logGDP 16874.136 211219.069 54271.690 0.79558 0.79516
## Income.composition.of.resources 16792.318 212750.648 52740.112 0.80135 0.80087
## Polio 16749.619 213549.676 51941.083 0.80436 0.80382
## thinness..1.19.years 16718.844 214127.843 51362.917 0.80654 0.80594
## percentage.expenditure 16699.760 214496.365 50994.395 0.80792 0.80727
## Measles 16682.530 214830.184 50660.576 0.80918 0.80846
## Total.expenditure 16675.783 214981.292 50509.468 0.80975 0.80897
## Alcohol 16671.712 215085.919 50404.841 0.81014 0.80930
## ------------------------------------------------------------------------------------------------
#KNN
# 10-fold cross-validation (a single repeat) is used to choose k.
fit_cont <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
set.seed(136)
# KNN is distance based, so the predictors are centred and scaled first.
# Without this, columns measured in large units dominate the distance and the
# remaining predictors barely influence which neighbours are picked.
# NOTE: the recorded output below ("No pre-processing") predates this change;
# re-run the script to regenerate it.
knnfit <- train(
  Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
    percentage.expenditure + Measles + BMI + under.five.deaths + Polio +
    Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
    thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP,
  data = cleandataMedian,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = fit_cont,
  tuneGrid = expand.grid(k = 1:30)
)
knnfit
## k-Nearest Neighbors
##
## 2928 samples
## 16 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 2636, 2635, 2633, 2636, 2635, 2636, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 5.488143 0.6935591 3.449756
## 2 4.921379 0.7397969 3.185675
## 3 4.896706 0.7389821 3.174618
## 4 4.905568 0.7367200 3.191503
## 5 4.889514 0.7374605 3.204195
## 6 4.936552 0.7319917 3.237000
## 7 4.973915 0.7277472 3.266920
## 8 4.985925 0.7263364 3.289079
## 9 4.980177 0.7272703 3.295459
## 10 4.986556 0.7263713 3.308424
## 11 5.006264 0.7245177 3.326198
## 12 5.027220 0.7222950 3.341819
## 13 5.046430 0.7202132 3.371293
## 14 5.072814 0.7173954 3.393572
## 15 5.117980 0.7123708 3.425087
## 16 5.141204 0.7100030 3.448251
## 17 5.169575 0.7067867 3.476042
## 18 5.210799 0.7022013 3.500357
## 19 5.225452 0.7006864 3.512528
## 20 5.227905 0.7006563 3.519559
## 21 5.245156 0.6989557 3.544645
## 22 5.268221 0.6963622 3.559881
## 23 5.277116 0.6954608 3.575229
## 24 5.291620 0.6939448 3.591443
## 25 5.304875 0.6926038 3.603206
## 26 5.327564 0.6901494 3.622863
## 27 5.350114 0.6876450 3.639980
## 28 5.362781 0.6865690 3.652476
## 29 5.376651 0.6851425 3.664897
## 30 5.397276 0.6828929 3.684026
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
#World Map, Color plotting
#Creating the World
#library(ggplot2)
#library(tidyverse)
#library(ggthemes)
#world_map = map_data("world") %>% filter(! long > 180)
#countries = world_map %>% distinct(region) %>% rowid_to_column()
#countries %>% ggplot(aes(fill = rowid, map_id = region)) + geom_map(map = world_map) + expand_limits(x = world_map$long, y = world_map$lat) + coord_map("moll") +theme_map()
#Color world plotting
library(ggplot2)
library(tidyverse)
# Copy of the cleaned data used for the maps. map_data("world") identifies
# countries in a column called `region`, so the first column (Country) is
# renamed to match before joining.
dataforcolmap <- cleandataMedian
colnames(dataforcolmap)[1] <- "region"
view(dataforcolmap)
# The data hold several rows (years) per country. Joining them directly onto
# the map outline repeated every polygon vertex once per year and triggered
# dplyr's "many-to-many relationship" warning. Only one row per country is
# needed to colour a map, so the most recent year of each country is kept.
latest_by_region <- dataforcolmap %>%
  group_by(region) %>%
  slice_max(Year, n = 1, with_ties = FALSE) %>%
  ungroup()
# Outline data for plotting.
mapdata <- map_data("world")
view(mapdata)
# Each outline vertex may match at most one data row; `relationship` makes
# dplyr raise an error if that ever stops being true.
# NOTE(review): countries spelled differently in the two sources will not
# match and are dropped by the filters below - confirm the map coverage.
mapdata <- left_join(
  mapdata, latest_by_region,
  by = "region", relationship = "many-to-one"
)
view(mapdata)
# Keep only the vertices that received data for the variable being mapped.
# Life expectancy
mapdata1 <- mapdata %>% filter(!is.na(Life.expectancy))
# Status
mapdata2 <- mapdata %>% filter(!is.na(Status))
# Income composition of resources
mapdata3 <- mapdata %>% filter(!is.na(Income.composition.of.resources))
# Theme shared by the three maps: hide axis text, ticks and titles.
no_axes <- theme(
  axis.text.x = element_blank(),
  axis.text.y = element_blank(),
  axis.ticks = element_blank(),
  axis.title.y = element_blank(),
  axis.title.x = element_blank()
)
# Map of life expectancy
map1 <- ggplot(mapdata1, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Life.expectancy), color = "black") +
  no_axes +
  ggtitle("Life Expectancy per Country") +
  scale_fill_gradient(low = "red", high = "yellow") +
  guides(fill = guide_legend(title = "Life Expectancy"))
map1
# Map of development status. The former `col = "orange"` inside aes() was
# removed: the fixed color = "black" already sets the outline colour.
mapStatus <- ggplot(mapdata2, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Status), color = "black") +
  no_axes +
  ggtitle("Country's Status: Developed v. Developing")
mapStatus
# Map of income composition of resources
mapIncome <- ggplot(mapdata3, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Income.composition.of.resources), color = "black") +
  no_axes +
  ggtitle("Income Composition of Resources per Country") +
  scale_fill_gradient(low = "red", high = "yellow") +
  guides(fill = guide_legend(title = "Income Composition of Resources"))
mapIncome
# Using rpart library
#treeimb <- rpart(ExplVar ~ ., data = train)
#pred.treeimb <- predict(treeimb, newdata = test)